Installing Required Packages
# install required packages/library
install.packages("matlib")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("rsample")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("corrplot")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("MASS")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
## Warning: package 'MASS' is not available for this version of R
## 'MASS' version 7.3-65 is in the repositories but depends on R (>= 4.4.0)
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
install.packages("gridExtra")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("knitr")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("matrixStats")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggdensity")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
Loading Libraries
# import required packages/libraries
library(matlib) # for matrix operations
library(ggplot2) # for data visualization
library(rsample) # for data splitting
library(corrplot) # for correlation visualization
## corrplot 0.95 loaded
library(MASS) # for statistical functions
library(gridExtra) # for arranging plots
library(dplyr) # for data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr) # for tables
library(doParallel) # For parallel processing
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(foreach) # For parallel processing (used with doParallel)
library(ggdensity) # Optional: For enhanced 2D density plots/contours
Loading Datasets
# load features dataset (independent variable)
features <- as.matrix(read.csv(file="data/features.csv", header=FALSE))
colnames(features) <- c("x1", "x3", "x4", "x5")
# load target dataset (dependent variable)
target <- as.matrix(read.csv(file="data/target.csv", header=FALSE))
colnames(target) <- c("X2")
# load time series dataset
time <- as.matrix((read.csv(file="data/timeseries.csv", header=FALSE)))
colnames(time) <- c("T1")
# display the first few rows of each dataset
cat("Features Dataset: \n")
## Features Dataset:
head(features)
## x1 x3 x4 x5
## [1,] 8.34 40.77 1010.84 90.01
## [2,] 23.64 58.49 1011.40 74.20
## [3,] 29.74 56.90 1007.15 41.91
## [4,] 19.07 49.69 1007.22 76.79
## [5,] 11.80 40.66 1017.13 97.20
## [6,] 13.97 39.16 1016.05 84.60
cat("\nTarget Dataset: \n")
##
## Target Dataset:
head(target)
## X2
## [1,] 480.48
## [2,] 445.75
## [3,] 438.76
## [4,] 453.09
## [5,] 464.43
## [6,] 470.96
cat("\nTimeseries Dataset: \n")
##
## Timeseries Dataset:
# BUG FIX: the original called head(features) again here, so the
# "Timeseries Dataset" heading was followed by a second printout of the
# features matrix. Preview the time-series data (column T1) instead.
head(time)
Task 2: Regression Modeling
Task 2.1: Model Parameters
# OLS helper
# OLS helper: solves the normal equations theta-hat = (X'X)^{-1} X'y.
# Uses the Moore-Penrose pseudo-inverse (MASS::ginv), so a rank-deficient
# design still yields the minimum-norm least-squares solution.
#
# Args:
#   X: numeric design matrix (n x k), including any intercept column.
#   y: numeric response vector or n x 1 matrix.
# Returns: k x 1 matrix of estimated coefficients.
fit_ls <- function(X, y) {
  XtX <- crossprod(X) # t(X) %*% X, computed once
  # BUG FIX: `det(XtX) == 0` is an exact floating-point equality test that
  # essentially never fires, and det() under/overflows for large or badly
  # scaled matrices. The reciprocal condition number is the reliable check.
  if (rcond(XtX) < .Machine$double.eps) {
    warning("Matrix (X'X) is (near-)singular. OLS coefficients might not be unique.")
  }
  ginv(XtX) %*% crossprod(X, y)
}
# function scales columns and handles cases where
# a column might be constant
# Standardises (centres and scales) each column of a numeric matrix.
# A constant column has sd == 0, so scale() produces NaN for it; those
# entries are reset to 0 (the mean-centred value) so downstream OLS fits
# are not poisoned by NaNs. Returns an empty matrix for degenerate input.
#
# Args:
#   mat: numeric matrix of raw features, or NULL.
# Returns: scaled matrix of the same dimensions (with scale() attributes),
#   a 0 x 0 matrix for NULL input, or an n x 0 matrix for 0-column input.
scale_features_matrix <- function(mat) {
  # BUG FIX: the original combined the NULL and zero-column cases, then
  # called nrow(mat) in the return -- nrow(NULL) is NULL, which makes
  # matrix(nrow = NULL, ...) error. Handle NULL separately.
  if (is.null(mat)) {
    return(matrix(numeric(0), nrow = 0, ncol = 0))
  }
  if (ncol(mat) == 0) {
    return(matrix(nrow = nrow(mat), ncol = 0))
  }
  scaled_mat <- scale(mat)
  # Zero-variance columns yield NaN under scale(); map them to 0.
  scaled_mat[is.na(scaled_mat)] <- 0
  return(scaled_mat)
}
# Build each model and fit
theta_estimates <- list()
# Model 1: y = θ1·x4 + θ2·x3² + θbias
# 1. Create raw feature matrix for this model
X1_raw_features <- cbind(x4 = features[,"x4"],
x3_squared = features[,"x3"]^2)
# 2. Scale these raw features
X1_scaled_features <- scale_features_matrix(X1_raw_features)
# 3. Add the intercept column to the scaled features
X1 <- cbind("(Intercept)" = 1, X1_scaled_features)
# 4. Fit the OLS model
theta_estimates[["Model1"]] <- fit_ls(X1, target)
rownames(theta_estimates[["Model1"]]) <- colnames(X1)
# Model 2: y = θ1·x4 + θ2·x3² + θ3·x5 + θbias
X2_raw_features <- cbind(x4 = features[,"x4"],
x3_squared = features[,"x3"]^2,
x5 = features[,"x5"])
X2_scaled_features <- scale_features_matrix(X2_raw_features)
X2 <- cbind("(Intercept)" = 1, X2_scaled_features)
theta_estimates[["Model2"]] <- fit_ls(X2, target)
rownames(theta_estimates[["Model2"]]) <- colnames(X2)
# Model 3: y = θ1·x3 + θ2·x4 + θ3·x5³ + θbias
X3_raw_features <- cbind(x3 = features[,"x3"],
x4 = features[,"x4"],
x5_cubed = features[,"x5"]^3)
X3_scaled_features <- scale_features_matrix(X3_raw_features)
X3 <- cbind("(Intercept)" = 1, X3_scaled_features)
theta_estimates[["Model3"]] <- fit_ls(X3, target)
rownames(theta_estimates[["Model3"]]) <- colnames(X3)
# Model 4: y = θ1·x4 + θ2·x3² + θ3·x5³ + θbias
X4_raw_features <- cbind(x4 = features[,"x4"],
x3_squared = features[,"x3"]^2,
x5_cubed = features[,"x5"]^3)
X4_scaled_features <- scale_features_matrix(X4_raw_features)
X4 <- cbind("(Intercept)" = 1, X4_scaled_features)
theta_estimates[["Model4"]] <- fit_ls(X4, target)
rownames(theta_estimates[["Model4"]]) <- colnames(X4)
# Model 5: y = θ1·x4 + θ2·x1² + θ3·x3² + θbias
X5_raw_features <- cbind(x4 = features[,"x4"],
x1_squared = features[,"x1"]^2,
x3_squared = features[,"x3"]^2)
X5_scaled_features <- scale_features_matrix(X5_raw_features)
X5 <- cbind("(Intercept)" = 1, X5_scaled_features)
theta_estimates[["Model5"]] <- fit_ls(X5, target)
rownames(theta_estimates[["Model5"]]) <- colnames(X5)
# Print results
for (model in names(theta_estimates)) {
cat("\n", model, "θ̂ =\n")
print(theta_estimates[[model]])
}
##
## Model1 θ̂ =
## X2
## (Intercept) 454.365009
## x4 3.348278
## x3_squared -13.211452
##
## Model2 θ̂ =
## X2
## (Intercept) 454.365009
## x4 3.432657
## x3_squared -12.406622
## x5 2.517326
##
## Model3 θ̂ =
## X2
## (Intercept) 454.365009
## x3 -12.722943
## x4 3.402683
## x5_cubed 2.315840
##
## Model4 θ̂ =
## X2
## (Intercept) 454.365009
## x4 3.487220
## x3_squared -12.402038
## x5_cubed 2.487015
##
## Model5 θ̂ =
## X2
## (Intercept) 454.365009
## x4 1.349107
## x1_squared -10.605331
## x3_squared -5.173124
Task 2.3: Log-likelihood and Variance Calculation
# Constants for Log-Likelihood Calculation
n <- nrow(target) # Number of data samples
ln_2pi <- log(2 * pi) # Pre-calculate ln(2Ï€)
# Compute Log-Likelihood and Variance for each model
log_likelihood_values <- list()
variance_estimates <- list() # List to store calculated variances (sigma_hat_squared)
# Iterate through the RSS values and compute, per model, the residual
# variance estimate and the Gaussian log-likelihood.
# NOTE(review): `rss_estimates` is not defined anywhere in this file's
# visible code (presumably computed in a Task 2.2 chunk not shown here) --
# confirm it exists in the workspace before running this block.
for (model_name in names(rss_estimates)) {
current_rss <- rss_estimates[[model_name]]
# Only compute log-likelihood if RSS is a valid number
if (!is.na(current_rss) && is.numeric(current_rss)) {
# Calculate sigma_hat_squared
# NOTE(review): divides by (n - 1), which matches neither the MLE (n)
# nor the unbiased OLS estimator (n - k). Presumably this follows the
# assignment's given formula -- verify against the task sheet.
sigma_hat_squared <- current_rss / (n - 1)
variance_estimates[[model_name]] <- sigma_hat_squared # Store the variance
# Check for non-positive or zero sigma_hat_squared before log
if (sigma_hat_squared <= 0) {
warning(paste("Model", model_name, ": sigma_hat_squared is non-positive or zero. Log-likelihood set to NA."))
log_likelihood_values[[model_name]] <- NA
} else {
# Compute the log-likelihood using the given formula:
# ln L = -(n/2) ln(2*pi) - (n/2) ln(sigma^2) - RSS / (2 sigma^2)
log_likelihood <- - (n / 2) * ln_2pi - (n / 2) * log(sigma_hat_squared) - (1 / (2 * sigma_hat_squared)) * current_rss
log_likelihood_values[[model_name]] <- log_likelihood
}
} else {
log_likelihood_values[[model_name]] <- NA # Set log-likelihood to NA if RSS was invalid
variance_estimates[[model_name]] <- NA # Set variance to NA if RSS was invalid
}
}
cat("\n--- Task 2.3: Log-Likelihood and Variance Estimates ---\n")
##
## --- Task 2.3: Log-Likelihood and Variance Estimates ---
for (model_name in names(log_likelihood_values)) {
cat("\n", model_name, "\n")
cat(" Variance (σ̂²):", variance_estimates[[model_name]], "\n")
cat(" Log-Likelihood:", log_likelihood_values[[model_name]], "\n")
}
##
## Model1
## Variance (σ̂²): 68.69951
## Log-Likelihood: -33810.99
##
## Model2
## Variance (σ̂²): 62.96091
## Log-Likelihood: -33393.69
##
## Model3
## Variance (σ̂²): 57.2271
## Log-Likelihood: -32936.88
##
## Model4
## Variance (σ̂²): 63.09509
## Log-Likelihood: -33403.88
##
## Model5
## Variance (σ̂²): 38.21731
## Log-Likelihood: -31005.4
Task 2.4: Compute AIC and BIC
# Constants for AIC/BIC Calculation
n <- nrow(target) # Number of data samples
# Define 'k' (number of estimated parameters) for each model
k_values <- list(
Model1 = 3, # θ1·x4 + θ2·x3² + θbias
Model2 = 4, # θ1·x4 + θ2·x3² + θ3·x5 + θbias
Model3 = 4, # θ1·x3 + θ2·x4 + θ3·x5³ + θbias
Model4 = 4, # θ1·x4 + θ2·x3² + θ3·x5³ + θbias
Model5 = 4 # θ1·x4 + θ2·x1² + θ3·x3² + θbias
)
# Compute AIC and BIC for each model
aic_values <- list()
bic_values <- list()
# Iterate through the log-likelihood values from Task 2.3
for (model_name in names(log_likelihood_values)) {
current_log_likelihood <- log_likelihood_values[[model_name]]
k_param <- k_values[[model_name]] # Get the 'k' value for the current model
# Only compute AIC/BIC if log-likelihood is a valid number and k is defined
if (!is.na(current_log_likelihood) && is.numeric(current_log_likelihood) &&
!is.null(k_param) && !is.na(k_param) && is.numeric(k_param)) {
# Compute AIC: AIC = 2k - 2 ln p(D|θ̂)
aic <- 2 * k_param - 2 * current_log_likelihood
aic_values[[model_name]] <- aic
# Compute BIC: BIC = k * ln(n) - 2 ln p(D|θ̂)
bic <- k_param * log(n) - 2 * current_log_likelihood
bic_values[[model_name]] <- bic
} else {
aic_values[[model_name]] <- NA # Set to NA if inputs were invalid
bic_values[[model_name]] <- NA # Set to NA if inputs were invalid
}
}
cat("\n--- Task 2.4: Akaike Information Criterion (AIC) & Bayesian Information Criterion (BIC) ---\n")
##
## --- Task 2.4: Akaike Information Criterion (AIC) & Bayesian Information Criterion (BIC) ---
for (model_name in names(aic_values)) {
cat("\n", model_name, "\n")
cat(" AIC:", aic_values[[model_name]], "\n")
cat(" BIC:", bic_values[[model_name]], "\n")
}
##
## Model1
## AIC: 67627.98
## BIC: 67649.48
##
## Model2
## AIC: 66795.38
## BIC: 66824.05
##
## Model3
## AIC: 65881.77
## BIC: 65910.43
##
## Model4
## AIC: 66815.75
## BIC: 66844.42
##
## Model5
## AIC: 62018.79
## BIC: 62047.46
Task 2.5: Distribution of Model Prediction Errors
# Model definitions to reconstruct X matrices for calculating residuals.
# Design-matrix builders for each candidate model, driven by a declarative
# table of term transformations. Each builder has the same contract as
# before: it takes the raw feature matrix and a scaling function, and
# returns the scaled design matrix with a leading "(Intercept)" column.
model_x_builders <- local({
  term_specs <- list(
    Model1 = list(x4 = function(f) f[, "x4"],
                  x3_squared = function(f) f[, "x3"]^2),
    Model2 = list(x4 = function(f) f[, "x4"],
                  x3_squared = function(f) f[, "x3"]^2,
                  x5 = function(f) f[, "x5"]),
    Model3 = list(x3 = function(f) f[, "x3"],
                  x4 = function(f) f[, "x4"],
                  x5_cubed = function(f) f[, "x5"]^3),
    Model4 = list(x4 = function(f) f[, "x4"],
                  x3_squared = function(f) f[, "x3"]^2,
                  x5_cubed = function(f) f[, "x5"]^3),
    Model5 = list(x4 = function(f) f[, "x4"],
                  x1_squared = function(f) f[, "x1"]^2,
                  x3_squared = function(f) f[, "x3"]^2)
  )
  # Turn each spec into a builder closure capturing its own term table.
  lapply(term_specs, function(spec) {
    force(spec)
    function(features, scale_func) {
      X_raw <- do.call(cbind, lapply(spec, function(term) term(features)))
      colnames(X_raw) <- names(spec)
      cbind("(Intercept)" = 1, scale_func(X_raw))
    }
  })
})
# Loop through each model to calculate residuals and generate Q-Q plots
for (model_name in names(model_x_builders)) {
# Reconstruct the design matrix X using the builder function and global helpers
X <- model_x_builders[[model_name]](features, scale_features_matrix)
# Retrieve the estimated parameters (theta) for the current model
theta <- theta_estimates[[model_name]]
# Check if theta is valid and has correct dimensions before calculating residuals
if (is.null(theta) || any(is.na(theta)) || length(theta) != ncol(X)) {
warning(paste("Cannot compute residuals for", model_name, ": Invalid, missing, or dimension-mismatched theta estimates."))
next
}
# Ensure theta is a matrix for matrix multiplication
if (is.vector(theta)) {
theta <- matrix(theta, ncol = 1)
}
# Calculate predicted values (y_hat)
y_pred <- X %*% theta
# Calculate residuals (prediction errors): y - y_hat
residuals <- target - y_pred
# Flatten residuals to a vector for qqnorm function
residuals_vec <- as.vector(residuals)
# Check for NA/NaN/Inf in residuals before plotting
if (any(is.na(residuals_vec)) || any(is.infinite(residuals_vec))) {
warning(paste("Residuals for", model_name, "contain NA/Inf values. Q-Q plot may not be generated properly."))
next
}
# Plot Q-Q plot
qqnorm(residuals_vec, main = paste("Q-Q Plot of Residuals for", model_name))
qqline(residuals_vec, col = "red")
}





# Compile all model evaluation metrics into a comprehensive table
model_comparison <- data.frame(
Model = names(theta_estimates), # Using names from theta_estimates
Parameters = sapply(theta_estimates, function(theta) length(theta[!is.na(theta)])),
RSS = sapply(rss_estimates, function(rss) ifelse(is.null(rss) || length(rss) == 0 || is.na(rss), NA, rss)),
LogLikelihood = sapply(log_likelihood_values, function(val) ifelse(is.null(val) || is.na(val), NA, val)),
AIC = sapply(aic_values, function(val) ifelse(is.null(val) || is.na(val), NA, val)),
BIC = sapply(bic_values, function(val) ifelse(is.null(val) || is.na(val), NA, val)),
Variance = sapply(variance_estimates, function(val) ifelse(is.null(val) || is.na(val), NA, val))
)
# Display the comprehensive table
cat("\n--- Comprehensive Comparison of All Models ---\n")
##
## --- Comprehensive Comparison of All Models ---
print(kable(model_comparison,
caption = "Comprehensive Comparison of All Models",
digits = c(0, 0, 2, 2, 2, 2, 4)))
##
##
## Table: Comprehensive Comparison of All Models
##
## | |Model | Parameters| RSS| LogLikelihood| AIC| BIC| Variance|
## |:------|:------|----------:|--------:|-------------:|--------:|--------:|--------:|
## |Model1 |Model1 | 3| 657248.2| -33810.99| 67627.98| 67649.48| 68.6995|
## |Model2 |Model2 | 4| 602347.1| -33393.69| 66795.38| 66824.05| 62.9609|
## |Model3 |Model3 | 4| 547491.6| -32936.88| 65881.77| 65910.43| 57.2271|
## |Model4 |Model4 | 4| 603630.7| -33403.88| 66815.75| 66844.42| 63.0951|
## |Model5 |Model5 | 4| 365625.0| -31005.40| 62018.79| 62047.46| 38.2173|
cat("\n") # Add a newline for better separation
# Rank models based on different criteria
# Lower values are better for RSS, AIC, BIC. Higher values are better for LogLikelihood, so we rank negatively.
rank_rss <- rank(model_comparison$RSS, na.last = "keep", ties.method = "min")
rank_aic <- rank(model_comparison$AIC, na.last = "keep", ties.method = "min")
rank_bic <- rank(model_comparison$BIC, na.last = "keep", ties.method = "min")
rank_loglik <- rank(-model_comparison$LogLikelihood, na.last = "keep", ties.method = "min") # Negative because higher is better
ranking_table <- data.frame(
Model = model_comparison$Model,
RSS_Rank = rank_rss,
AIC_Rank = rank_aic,
BIC_Rank = rank_bic,
LogLikelihood_Rank = rank_loglik
)
# Calculate Overall_Rank by summing individual ranks, considering only valid ranks
ranking_table$Overall_Rank <- rowSums(ranking_table[, c("RSS_Rank", "AIC_Rank", "BIC_Rank", "LogLikelihood_Rank")], na.rm = TRUE)
# Sort by overall rank (lower sum is better)
ranking_table <- ranking_table[order(ranking_table$Overall_Rank),]
# Display the ranking table
cat("\n--- Model Ranking Based on Different Criteria (Lower Sum of Ranks is Better) ---\n")
##
## --- Model Ranking Based on Different Criteria (Lower Sum of Ranks is Better) ---
print(kable(ranking_table,
caption = "Model Ranking Based on Different Criteria (Lower is Better)"))
##
##
## Table: Model Ranking Based on Different Criteria (Lower is Better)
##
## | |Model | RSS_Rank| AIC_Rank| BIC_Rank| LogLikelihood_Rank| Overall_Rank|
## |:--|:------|--------:|--------:|--------:|------------------:|------------:|
## |5 |Model5 | 1| 1| 1| 1| 4|
## |3 |Model3 | 2| 2| 2| 2| 8|
## |2 |Model2 | 3| 3| 3| 3| 12|
## |4 |Model4 | 4| 4| 4| 4| 16|
## |1 |Model1 | 5| 5| 5| 5| 20|
cat("\n") # Add a newline for better separation
# Determine the best model based on individual criteria
best_model_aic_index <- which.min(model_comparison$AIC)
best_model_aic_name <- model_comparison$Model[best_model_aic_index]
best_model_bic_index <- which.min(model_comparison$BIC)
best_model_bic_name <- model_comparison$Model[best_model_bic_index]
best_model_loglik_index <- which.max(model_comparison$LogLikelihood)
best_model_loglik_name <- model_comparison$Model[best_model_loglik_index]
best_model_rss_index <- which.min(model_comparison$RSS)
best_model_rss_name <- model_comparison$Model[best_model_rss_index]
cat(paste("Based on AIC, the best model is:", best_model_aic_name, "\n"))
## Based on AIC, the best model is: Model5
cat(paste("Based on BIC, the best model is:", best_model_bic_name, "\n"))
## Based on BIC, the best model is: Model5
cat(paste("Based on Log-Likelihood, the best model is:", best_model_loglik_name, "\n"))
## Based on Log-Likelihood, the best model is: Model5
cat(paste("Based on RSS, the best model is:", best_model_rss_name, "\n"))
## Based on RSS, the best model is: Model5
Train-Test Split
set.seed(123) # Set seed for reproducibility of data splitting
cat("\n--- Task 2.7: Training/Testing Split, Prediction, and Confidence Intervals ---\n")
##
## --- Task 2.7: Training/Testing Split, Prediction, and Confidence Intervals ---
# Split the input (features) and output (target) dataset
sample_size <- nrow(features)
train_indices <- sample(sample_size, size = round(0.7 * sample_size))
test_indices <- setdiff(1:sample_size, train_indices)
# Create training and testing datasets
train_features <- features[train_indices, ]
train_target <- target[train_indices, ]
test_features <- features[test_indices, ]
y_test <- target[test_indices, ]
cat(paste0("Data split: Training samples = ", nrow(train_features), ", Testing samples = ", nrow(test_features), "\n"))
## Data split: Training samples = 6698, Testing samples = 2870
# Use the best model and estimate its parameters using the training dataset
best_model_name <- best_model_aic_name
# Reconstruct the design matrix (X_train) for the best model using training features
if (!exists("model_x_builders") || is.null(model_x_builders[[best_model_name]])) {
stop("model_x_builders or the builder for the best model is not available. Please ensure Task 2.5 code is run first.")
}
X_train_model <- model_x_builders[[best_model_name]](train_features, scale_features_matrix)
# Estimate model parameters use the training dataset
theta_train <- fit_ls(X_train_model, train_target)
rownames(theta_train) <- colnames(X_train_model)
cat(paste0("\nEstimated parameters for ", best_model_name, " using training data:\n"))
##
## Estimated parameters for Model5 using training data:
print(theta_train)
## [,1]
## (Intercept) 454.354339
## x4 1.280183
## x1_squared -10.646216
## x3_squared -5.149130
# Compute the model's output/prediction on the testing data
X_test_model <- model_x_builders[[best_model_name]](test_features, scale_features_matrix)
# Compute predictions on the testing data
y_pred_test <- X_test_model %*% theta_train
# Calculate Test Error Metrics
test_rss <- sum((y_test - y_pred_test)^2)
test_rmse <- sqrt(mean((y_test - y_pred_test)^2))
test_mae <- mean(abs(y_test - y_pred_test))
test_r_squared <- 1 - sum((y_test - y_pred_test)^2) / sum((y_test - mean(y_test))^2)
# Display test metrics
test_metrics <- data.frame(
Metric = c("RSS", "RMSE", "MAE", "R-squared"),
Value = c(test_rss, test_rmse, test_mae, test_r_squared)
)
cat(paste0("\n--- Model Performance on Test Data (", best_model_name, ") ---\n"))
##
## --- Model Performance on Test Data (Model5) ---
print(kable(test_metrics, caption = paste0("Model Performance on Test Data for ", best_model_name)))
##
##
## Table: Model Performance on Test Data for Model5
##
## |Metric | Value|
## |:---------|------------:|
## |RSS | 1.067140e+05|
## |RMSE | 6.097752e+00|
## |MAE | 4.859314e+00|
## |R-squared | 8.726459e-01|
cat("\n")
# Compute 95% (model prediction) confidence intervals
n_train <- nrow(X_train_model)
k_params <- ncol(X_train_model) # Number of estimated parameters including intercept
# Calculate residuals from the training fit to estimate sigma_hat_squared
residuals_train <- train_target - (X_train_model %*% theta_train)
rss_train <- sum(residuals_train^2)
# Estimate variance of residuals (sigma_hat_squared) from training data
sigma_hat_squared_train <- rss_train / (n_train - k_params)
sigma_hat_train <- sqrt(sigma_hat_squared_train)
# Calculate the inverse of (X_train_model'X_train_model)
XTX_inv_train <- tryCatch(
solve(t(X_train_model) %*% X_train_model),
error = function(e) {
warning("Matrix (X'X) is singular for training data. Using ginv from MASS for inverse.")
ginv(t(X_train_model) %*% X_train_model)
}
)
# Degrees of freedom for t-distribution
df <- n_train - k_params
if (df <= 0) {
warning("Degrees of freedom are zero or negative. Cannot compute confidence intervals.")
lower_ci <- rep(NA, nrow(y_pred_test))
upper_ci <- rep(NA, nrow(y_pred_test))
} else {
# T-value for 95% confidence (alpha = 0.05, two-tailed)
t_critical <- qt(0.975, df = df)
# Calculate prediction intervals for each test point
lower_ci <- numeric(nrow(X_test_model))
upper_ci <- numeric(nrow(X_test_model))
for (i in 1:nrow(X_test_model)) {
x0_t <- matrix(X_test_model[i, ], nrow = 1) # Transpose to row vector
# Variance of the prediction for a new observation
pred_variance <- sigma_hat_squared_train * (1 + x0_t %*% XTX_inv_train %*% t(x0_t))
pred_sd <- sqrt(pred_variance)
margin_of_error <- t_critical * pred_sd
lower_ci[i] <- y_pred_test[i] - margin_of_error
upper_ci[i] <- y_pred_test[i] + margin_of_error
}
}
# Convert to vectors for plotting consistency in ggplot
y_test_vec <- as.vector(y_test)
y_pred_test_vec <- as.vector(y_pred_test)
# Prepare data for ggplot
plot_data <- data.frame(
index = seq_along(y_test_vec), # Use seq_along on the vector
Actual = y_test_vec, # Use the vector directly
Predicted = y_pred_test_vec, # Use the vector directly
LowerCI = lower_ci,
UpperCI = upper_ci
)
# Generate ggplot
ggplot(plot_data, aes(x = index)) +
geom_ribbon(aes(ymin = LowerCI, ymax = UpperCI, fill = "95% CI"), alpha = 0.2, na.rm = TRUE) + # Shaded CI
geom_line(aes(y = Predicted, color = "Predicted"), linetype = "dashed", size = 0.8, na.rm = TRUE) + # Predicted line
geom_point(aes(y = Actual, color = "Actual"), size = 1.0, alpha = 0.7, na.rm = TRUE) + # Actual points
scale_color_manual(name = "colour",
values = c("Actual" = "blue", "Predicted" = "red")) +
scale_fill_manual(name = "fill", values = c("95% CI" = "grey")) +
labs(
title = paste0(best_model_name, " : Actual vs. Predicted with 95% CI (Test Set)"), # Match title style
x = "index", # Match x-axis label style
y = "Net hourly output" # Match y-axis label style
) +
theme_minimal() + # Use a minimal theme
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.text = element_text(size = 9)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Task 3: Approximate Bayesian Computation (ABC)
Posterior Distribution and Parameter Estimation
# Specify Model 5 as the best model
best_model_name_abc <- "Model5"
cat(paste0("Using '", best_model_name_abc, "' as the best model for ABC, as instructed.\n"))
## Using 'Model5' as the best model for ABC, as instructed.
# Ensure the model_x_builder for Model 5 is available
if (!exists("model_x_builders") || is.null(model_x_builders[[best_model_name_abc]])) {
stop("model_x_builders or the builder for Model5 is not available. Please ensure Task 2.5 code is run first.")
}
# Construct the full design matrix for Model 5 using the entire dataset
X_model_5_full <- model_x_builders[[best_model_name_abc]](features, scale_features_matrix)
# Estimate parameters for Model 5 using the full dataset
theta_estimate_model_5_full <- fit_ls(X_model_5_full, target)
rownames(theta_estimate_model_5_full) <- colnames(X_model_5_full)
cat(paste0("\nLeast squares estimates for ", best_model_name_abc, " (full dataset):\n"))
##
## Least squares estimates for Model5 (full dataset):
print(theta_estimate_model_5_full)
## X2
## (Intercept) 454.365009
## x4 1.349107
## x1_squared -10.605331
## x3_squared -5.173124
# Convert to a data frame for easier manipulation and sorting
theta_df <- data.frame(
Parameter = rownames(theta_estimate_model_5_full),
Estimate = as.vector(theta_estimate_model_5_full)
)
theta_df$AbsEstimate <- abs(theta_df$Estimate)
theta_df_ordered <- theta_df[order(-theta_df$AbsEstimate), ]
# Select the top 2 parameters with largest absolute values to vary
params_to_vary_info <- theta_df_ordered[1:2, ]
params_to_vary_names <- params_to_vary_info$Parameter
# The remaining parameters are fixed to their estimated values
fixed_params_info <- theta_df_ordered[3:nrow(theta_df_ordered), ]
fixed_params_values <- setNames(fixed_params_info$Estimate, fixed_params_info$Parameter)
cat(paste0("\nParameters to vary in ABC (2 with largest absolute LS estimates):\n"))
##
## Parameters to vary in ABC (2 with largest absolute LS estimates):
print(params_to_vary_info)
## Parameter Estimate AbsEstimate
## 1 (Intercept) 454.36501 454.36501
## 3 x1_squared -10.60533 10.60533
cat(paste0("\nFixed parameters (and their estimated values):\n"))
##
## Fixed parameters (and their estimated values):
print(fixed_params_values)
## x3_squared x4
## -5.173124 1.349107
# Retrieve the LS estimates for the two parameters to be varied for prior centering
prior_mean_param1 <- theta_estimate_model_5_full[params_to_vary_names[1], 1]
prior_mean_param2 <- theta_estimate_model_5_full[params_to_vary_names[2], 1]
# Define uniform prior ranges: centered around LS estimate, with a minimum width
prior_range_half_param1 <- max(5 * abs(prior_mean_param1), 0.5) # Ensures exploration even if LS estimate is small
prior_range_half_param2 <- max(5 * abs(prior_mean_param2), 0.5)
prior_param1_min <- prior_mean_param1 - prior_range_half_param1
prior_param1_max <- prior_mean_param1 + prior_range_half_param1
prior_param2_min <- prior_mean_param2 - prior_range_half_param2
prior_param2_max <- prior_mean_param2 + prior_range_half_param2
priors_abc <- list(
param1 = list(name = params_to_vary_names[1], type = "uniform", min = prior_param1_min, max = prior_param1_max),
param2 = list(name = params_to_vary_names[2], type = "uniform", min = prior_param2_min, max = prior_param2_max)
)
cat("\nUniform prior distributions defined for the varying parameters:\n")
##
## Uniform prior distributions defined for the varying parameters:
print(priors_abc)
## $param1
## $param1$name
## [1] "(Intercept)"
##
## $param1$type
## [1] "uniform"
##
## $param1$min
## [1] -1817.46
##
## $param1$max
## [1] 2726.19
##
##
## $param2
## $param2$name
## [1] "x1_squared"
##
## $param2$type
## [1] "uniform"
##
## $param2$min
## [1] -63.63199
##
## $param2$max
## [1] 42.42132
# Function to sample a single set of parameters from the defined priors
# Draws one parameter set from the uniform priors.
#
# Args:
#   priors: a list with elements $param1 and $param2, each holding $name,
#     $min and $max. Defaults to the globally defined `priors_abc`, so the
#     original no-argument call sites keep working; passing priors
#     explicitly makes the function testable without globals.
# Returns: a named numeric vector of length 2 (names taken from the prior
#   definitions), with one draw per parameter.
sample_from_priors <- function(priors = priors_abc) {
  # Draw in the same order as before (param1 then param2) so the RNG
  # stream -- and therefore any seeded run -- is unchanged.
  param_samples <- c(
    runif(1, priors$param1$min, priors$param1$max),
    runif(1, priors$param2$min, priors$param2$max)
  )
  names(param_samples) <- c(priors$param1$name, priors$param2$name)
  return(param_samples)
}
# Calculate the observed summary statistic (RSS) for the full model on the full data,
y_pred_obs_full_model <- X_model_5_full %*% theta_estimate_model_5_full
residuals_obs_full_model <- target - y_pred_obs_full_model
observed_rss_full_model <- sum(residuals_obs_full_model^2)
cat(paste0("\nObserved RSS (S_obs) of ", best_model_name_abc, " on full dataset: ", observed_rss_full_model, "\n"))
##
## Observed RSS (S_obs) of Model5 on full dataset: 365624.974504426
Sample from ABC
# Function to calculate RSS for a given set of simulated parameters.
# ABC summary statistic: residual sum of squares of the observed response
# against predictions generated from a proposed coefficient vector.
#
# Args:
#   simulated_theta_vector: proposed coefficients (k x 1 matrix or vector).
#   X_matrix: design matrix (n x k) for the model being simulated.
#   observed_y: observed response (n x 1 matrix or vector).
# Returns: scalar RSS.
calculate_simulated_rss <- function(simulated_theta_vector, X_matrix, observed_y) {
  prediction_error <- observed_y - X_matrix %*% simulated_theta_vector
  sum(prediction_error^2)
}
cat(paste0("Summary statistic chosen: Residual Sum of Squares (RSS).\n"))
## Summary statistic chosen: Residual Sum of Squares (RSS).
cat(paste0("The observed RSS (S_obs) calculated in the previous block is: ", observed_rss_full_model, "\n"))
## The observed RSS (S_obs) calculated in the previous block is: 365624.974504426
Implement Rejection ABC Algorithm
# Define ABC parameters
# NOTE(review): num_accepted_samples equals num_simulations, so EVERY
# proposed draw is "accepted" and no rejection actually happens -- the
# resulting samples are effectively draws from the prior, not an ABC
# posterior (the huge final epsilon ~4.9e10 reported below confirms this).
# Consider accepting only the smallest-distance fraction of draws (e.g.
# the best 1-5%) or raising num_simulations well above the acceptance count.
num_simulations <- 1000 # Total number of parameter sets to propose and simulate
num_accepted_samples <- 1000 # Desired number of samples for the posterior distribution
# Setup parallel processing to speed up simulations
num_cores <- detectCores() - 1 # Use all but one core to leave one for system/other tasks
if (num_cores < 1) num_cores <- 1 # Ensure at least one core is used
cl <- makeCluster(num_cores)
registerDoParallel(cl)
cat(paste0("Registered ", num_cores, " cores for parallel processing of ABC simulations.\n"))
## Registered 7 cores for parallel processing of ABC simulations.
# Get the full design matrix for the selected model (Model 5) and the observed target
X_current_model_full <- model_x_builders[[best_model_name_abc]](features, scale_features_matrix)
y_observed_full <- target
set.seed(42) # Set seed for reproducibility of random sampling within ABC
# Run the ABC simulations in parallel
# Run the ABC simulations in parallel: each iteration proposes a parameter
# set from the priors, simulates the model, and records the distance
# between the simulated and observed summary statistics (RSS).
results <- foreach(
k = 1:num_simulations,
.combine = 'rbind', # Combine results row-wise into a data frame
# NOTE(review): 'matrixStats' does not appear to be used anywhere inside
# this loop body -- confirm whether this export is still needed.
.packages = c('matrixStats')
) %dopar% {
# Sample parameters from the prior distribution
proposed_theta_varying <- sample_from_priors()
# Construct the full proposed theta vector
# Start with the original LS estimates (which include fixed parameters)
# Then overwrite the varying parameters with the newly sampled ones
proposed_theta_full_sim <- theta_estimate_model_5_full
proposed_theta_full_sim[params_to_vary_names[1], 1] <- proposed_theta_varying[params_to_vary_names[1]]
proposed_theta_full_sim[params_to_vary_names[2], 1] <- proposed_theta_varying[params_to_vary_names[2]]
# Calculate the summary statistic for the simulated data
simulated_rss <- calculate_simulated_rss(
simulated_theta_vector = proposed_theta_full_sim,
X_matrix = X_current_model_full,
observed_y = y_observed_full
)
# Calculate the distance between the observed and simulated summary statistics
distance <- abs(observed_rss_full_model - simulated_rss)
# Return the proposed parameters and their distance
data.frame(
param1_val = proposed_theta_varying[params_to_vary_names[1]],
param2_val = proposed_theta_varying[params_to_vary_names[2]],
distance = distance
)
}
# Stop the parallel cluster after simulations are complete
stopCluster(cl)
cat("\nParallel processing stopped.\n")
##
## Parallel processing stopped.
# Sort the results by distance and select the 'num_accepted_samples' with the smallest distances
results_sorted <- results[order(results$distance), ]
accepted_samples_df <- results_sorted[1:num_accepted_samples, ]
# The final epsilon is determined by the largest distance among the accepted samples
final_epsilon <- max(accepted_samples_df$distance)
cat(paste0("\nABC Algorithm finished.\n"))
##
## ABC Algorithm finished.
cat(paste0("Total simulations performed: ", num_simulations, "\n"))
## Total simulations performed: 1000
cat(paste0("Number of accepted posterior samples: ", nrow(accepted_samples_df), "\n"))
## Number of accepted posterior samples: 1000
cat(paste0("Final effective epsilon (tolerance for accepted samples): ", final_epsilon, "\n"))
## Final effective epsilon (tolerance for accepted samples): 49359907374.0912
# Store the accepted parameter samples for subsequent plotting
posterior_samples <- accepted_samples_df[, 1:2]
colnames(posterior_samples) <- params_to_vary_names
cat("\n--- Task 3 - Sub-question 4 (Continued): Plot Marginal and Joint Posterior Distributions ---\n")
##
## --- Task 3 - Sub-question 4 (Continued): Plot Marginal and Joint Posterior Distributions ---
# Convert the collected posterior samples into a data frame suitable for ggplot
posterior_df <- as.data.frame(posterior_samples)
# Plot Marginal Posterior Distribution for the First Varying Parameter
p_marginal1 <- ggplot(posterior_df, aes(x = .data[[params_to_vary_names[1]]])) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue", color = "black") +
geom_density(color = "darkblue", linewidth = 1) +
labs(
title = paste("Marginal Posterior for Parameter:", params_to_vary_names[1]),
x = paste("Value of", params_to_vary_names[1]),
y = "Density"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
print(p_marginal1)

# Plot Marginal Posterior Distribution for the Second Varying Parameter
p_marginal2 <- ggplot(posterior_df, aes(x = .data[[params_to_vary_names[2]]])) +
geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightgreen", color = "black") +
geom_density(color = "darkgreen", linewidth = 1) +
labs(
title = paste("Marginal Posterior for Parameter:", params_to_vary_names[2]),
x = paste("Value of", params_to_vary_names[2]),
y = "Density"
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
print(p_marginal2)

# Plot the Joint Posterior Distribution of the Two Varying Parameters
p_joint <- ggplot(posterior_df, aes(x = .data[[params_to_vary_names[1]]], y = .data[[params_to_vary_names[2]]])) +
geom_point(alpha = 0.2, color = "purple") +
geom_density_2d(color = "darkblue", linewidth = 0.8) +
labs(
title = paste("Joint Posterior Distribution of", params_to_vary_names[1], "and", params_to_vary_names[2]),
x = paste("Value of", params_to_vary_names[1]),
y = paste("Value of", params_to_vary_names[2])
) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
print(p_joint)
